package com.suyunyou.spider.utils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.suyunyou.spider.data.LinkData;
import com.suyunyou.spider.plugins.IPagePlugin;
import com.suyunyou.spider.plugins.page.PageDtlPlugin;

/**
 * Crawler helpers: registering sites, adding/clearing page-extraction rules,
 * and extracting the domain prefix of a link.
 * @author yuejing
 * @date 2016-06-26 17:19:39
 * @version V1.0.0
 */
public class SpiderUtil {

	/**
	 * Regex that matches the domain prefix of a link: an optional scheme, an
	 * optional "//", then the host up to and including one of the known
	 * top-level domains.
	 * <p>
	 * The host part may not contain '/', '?', '#' or ':', so the match can never
	 * run into the path, query, fragment or port. The trailing lookahead requires
	 * the TLD to be followed by one of those delimiters (or the end of input), so
	 * a TLD that is only a prefix of a longer label (".cn" in ".cnn", ".com" in
	 * ".company") or an inner label (".com" in ".com.cn") is not accepted.
	 */
	private static final String DOMAIN_REG =
			"^(?:[a-z][a-z0-9+.\\-]*:)?(?://)?[^/?#:]*?\\.(?:com|cn|net|org|biz|info|cc|tv)(?=[/?#:]|$)";

	/**
	 * Compiled form of {@link #DOMAIN_REG}; compiled once because Pattern is
	 * immutable and thread-safe.
	 */
	private static final Pattern DOMAIN_PATTERN = Pattern.compile(DOMAIN_REG, Pattern.CASE_INSENSITIVE);

	/**
	 * Registers a site address by delegating to {@link LinkData#addSite}.
	 * @param link   the site address
	 * @param siteId the site id
	 */
	public static void addSite(String link, Integer siteId) {
		LinkData.addSite(link, siteId);

		// NOTE: adding the site's domain to the to-be-analyzed addresses
		// (FetcherPageLinkData.addSite(link)) is currently disabled.
	}

	/**
	 * Adds a rule for extracting article detail pages: builds a
	 * {@link PageDtlPlugin} from the arguments and registers it with PluginUtil.
	 * @param regex         passed to the PageDtlPlugin constructor as its first argument
	 * @param titleSelect   passed to the PageDtlPlugin constructor as its second argument
	 * @param contentSelect passed to the PageDtlPlugin constructor as its third argument
	 */
	public static void addSiteFetcherPage(String regex, String titleSelect, String contentSelect) {
		// TODO: this handling still needs to be revised (note carried over from the original author)
		IPagePlugin plugin = new PageDtlPlugin(regex, titleSelect, contentSelect);
		PluginUtil.addIPagePlugin(plugin);
	}

	/**
	 * Removes all article-detail extraction rules by delegating to
	 * PluginUtil.clearPlugins().
	 */
	public static void clearPluginsFetcherPage() {
		// TODO: this handling still needs to be revised (note carried over from the original author)
		PluginUtil.clearPlugins();
	}

	/**
	 * Returns the domain prefix of a link, including the scheme when present
	 * (e.g. {@code https://mp.qq.suyunyou.com/a/b?x=1} gives
	 * {@code https://mp.qq.suyunyou.com}).
	 * <p>
	 * Only hosts ending in one of the top-level domains listed in
	 * {@link #DOMAIN_REG} are recognized.
	 * @param link the link to inspect; may be {@code null}
	 * @return the link truncated right after the host's top-level domain, or the
	 *         link unchanged (including {@code null}) when no such host is found
	 */
	public static String getDomain(String link) {
		if (link == null) {
			return null;
		}
		Matcher matcher = DOMAIN_PATTERN.matcher(link);
		if (matcher.find()) {
			// The pattern is anchored at the start of the link, so the match end
			// is exactly the length of the domain prefix.
			return link.substring(0, matcher.end());
		}
		return link;
	}

	/**
	 * Manual smoke test: prints the domain extracted from a sample URL.
	 * @param args unused
	 */
	public static void main(String[] args) {
		String url = "https://mp.qq.suyunyou.com/sfsdf/sflsjf/?sfsd=&dsf=sdf";
		String domain = getDomain(url);
		System.out.println(domain);
	}
}